{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "import tensorflow as tf\n",
    "from collections import Counter\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "10662\n",
      "10662\n"
     ]
    }
   ],
   "source": [
    "# Read the labelled text files under 'data', then overwrite the texts and labels\n",
    "# with whatever separate_dataset returns for them (helper is not defined in this\n",
    "# notebook; presumably it arrives through `from utils import *`).\n",
    "trainset = sklearn.datasets.load_files(\n",
    "    container_path='data',\n",
    "    encoding='UTF-8'\n",
    ")\n",
    "trainset.data, trainset.target = separate_dataset(trainset, 1.0)\n",
    "\n",
    "# Class names, then the number of texts and the number of labels (one per line).\n",
    "print(trainset.target_names)\n",
    "print(len(trainset.data), len(trainset.target), sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total words: 197234\n"
     ]
    }
   ],
   "source": [
    "# Concatenate every document and split on whitespace to get one flat word stream.\n",
    "texts = ' '.join(trainset.data)\n",
    "words = texts.split()\n",
    "\n",
    "# Occurrence count of each distinct word.\n",
    "word2freq = Counter()\n",
    "word2freq.update(words)\n",
    "\n",
    "print('Total words: %d' % len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary size: 20465\n"
     ]
    }
   ],
   "source": [
    "# Build the vocabulary lookup tables.\n",
    "# Iterating a set of strings directly yields an order that can change from one\n",
    "# interpreter run to the next (string hash randomisation), so ids are assigned\n",
    "# over the *sorted* vocabulary to keep word <-> id stable across kernel restarts.\n",
    "_words = set(words)\n",
    "idx2word = dict(enumerate(sorted(_words)))\n",
    "word2idx = {word: idx for idx, word in idx2word.items()}\n",
    "vocab_size = len(idx2word)\n",
    "\n",
    "# The corpus re-expressed as a sequence of vocabulary ids.\n",
    "indexed = [word2idx[w] for w in words]\n",
    "print('Vocabulary size:', vocab_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class CBOW:\n",
    "    \"\"\"Continuous bag-of-words embedding graph trained with an NCE loss (TF 1.x graph API).\n",
    "\n",
    "    Args:\n",
    "        sample_size: forwarded to tf.nn.nce_loss as num_sampled.\n",
    "        vocab_size: number of rows of the embedding matrix and num_classes of the loss.\n",
    "        embedded_size: width of each embedding vector.\n",
    "        window_size: the input placeholder X has 2 * window_size columns.\n",
    "\n",
    "    Attributes:\n",
    "        X: int32 placeholder of shape [None, 2 * window_size].\n",
    "        Y: int32 placeholder of shape [None, 1].\n",
    "        embedding: [vocab_size, embedded_size] variable; also used as the NCE weights.\n",
    "        bias: [vocab_size] variable initialised to zeros; the NCE biases.\n",
    "        cost: mean of the NCE loss.\n",
    "        optimizer: gradient-descent op (learning rate 1.0) minimising cost.\n",
    "        valid_dataset: int32 placeholder of shape [None] holding rows to query.\n",
    "        similarity: products of the L2-normalised queried rows with every\n",
    "            L2-normalised embedding row.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, sample_size, vocab_size, embedded_size, window_size=3):\n",
    "        # Inputs: a fixed-width block of ids per example and one label id.\n",
    "        context_width = 2 * window_size\n",
    "        self.X = tf.placeholder(tf.int32, shape=[None, context_width])\n",
    "        self.Y = tf.placeholder(tf.int32, shape=[None, 1])\n",
    "\n",
    "        # Parameters: truncated-normal embeddings scaled by 1/sqrt(width), zero biases.\n",
    "        init_stddev = 1.0 / np.sqrt(embedded_size)\n",
    "        initial_embedding = tf.truncated_normal([vocab_size, embedded_size], stddev=init_stddev)\n",
    "        self.embedding = tf.Variable(initial_embedding)\n",
    "        self.bias = tf.Variable(tf.zeros([vocab_size]))\n",
    "\n",
    "        # Average the looked-up vectors of each example along the window axis.\n",
    "        context_vectors = tf.nn.embedding_lookup(self.embedding, self.X)\n",
    "        context_mean = tf.reduce_mean(context_vectors, axis=1)\n",
    "\n",
    "        # The same embedding matrix serves as the NCE output weights.\n",
    "        nce_per_example = tf.nn.nce_loss(weights=self.embedding,\n",
    "                                         biases=self.bias,\n",
    "                                         labels=self.Y,\n",
    "                                         inputs=context_mean,\n",
    "                                         num_sampled=sample_size,\n",
    "                                         num_classes=vocab_size)\n",
    "        self.cost = tf.reduce_mean(nce_per_example)\n",
    "        self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.cost)\n",
    "\n",
    "        # Similarity head: normalise every row, then multiply queried rows by all rows.\n",
    "        self.valid_dataset = tf.placeholder(tf.int32, shape=[None])\n",
    "        squared_row_sums = tf.reduce_sum(tf.square(self.embedding), 1, keep_dims=True)\n",
    "        normalized_embeddings = self.embedding / tf.sqrt(squared_row_sums)\n",
    "        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, self.valid_dataset)\n",
    "        self.similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training schedule.\n",
    "epoch = 10               # number of passes over the training pairs\n",
    "batch_size = 128         # rows per training step; also given to CBOW as sample_size\n",
    "\n",
    "# Model / data geometry.\n",
    "window_size = 3          # context positions taken on each side of a target\n",
    "embedded_size = 128      # width of each embedding vector\n",
    "\n",
    "# Per-epoch similarity report.\n",
    "valid_size = 10          # how many corpus positions are sampled for the report\n",
    "nearest_neighbors = 8    # how many neighbours are listed per sampled word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from an empty default graph so re-running this cell does not accumulate ops.\n",
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "\n",
    "# window_size is passed explicitly: CBOW otherwise falls back to its own default of 3,\n",
    "# which would silently disagree with the notebook-level window_size used to build the\n",
    "# training windows if that setting were ever changed.\n",
    "# Note: batch_size is what the model receives as sample_size (the NCE num_sampled).\n",
    "model = CBOW(batch_size, vocab_size, embedded_size, window_size=window_size)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_x(words, idx, window=None):\n",
    "    \"\"\"Return the items surrounding position idx, with the item at idx left out.\n",
    "\n",
    "    Args:\n",
    "        words: sequence that supports slicing and `+` concatenation (a list here).\n",
    "        idx: position of the centre item.\n",
    "        window: items taken on each side; defaults to the notebook-level window_size.\n",
    "\n",
    "    Returns:\n",
    "        Up to `window` items before idx followed by up to `window` items after it.\n",
    "    \"\"\"\n",
    "    if window is None:\n",
    "        window = window_size\n",
    "    # A negative slice start would wrap around to the end of the sequence and give a\n",
    "    # wrong (usually empty) left context, so the left bound is clamped at 0.\n",
    "    left = max(idx - window, 0)\n",
    "    right = idx + window\n",
    "    return words[left:idx] + words[idx + 1:right + 1]\n",
    "\n",
    "\n",
    "def make_xy(int_words, window=None):\n",
    "    \"\"\"Build the (context, target) training arrays from a sequence of ids.\n",
    "\n",
    "    Only positions that have `window` items on both sides are used, so every\n",
    "    context row has exactly 2 * window entries.\n",
    "\n",
    "    Args:\n",
    "        int_words: sequence of ids.\n",
    "        window: items taken on each side; defaults to the notebook-level window_size.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (x, y) of numpy arrays: x holds one context row per target, y the targets.\n",
    "    \"\"\"\n",
    "    if window is None:\n",
    "        window = window_size\n",
    "    x, y = [], []\n",
    "    for centre in range(window, len(int_words) - window):\n",
    "        x.append(get_x(int_words, centre, window))\n",
    "        y.append(int_words[centre])\n",
    "    return np.array(x), np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Materialise the context rows (X) and their targets (Y) for the whole indexed corpus.\n",
    "X, Y = make_xy(int_words=indexed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1, avg loss 115.821989\n",
      "Nearest to best: silly, might, es, youve, such, ultimately, youll, visual,\n",
      "Nearest to in: film, it, not, its, movie, to, this, is,\n",
      "Nearest to as: de, an, through, her, old, dull, been, they,\n",
      "Nearest to science: warm, 1960, pure, winter, da, terrific, unforgettably, bright,\n",
      "Nearest to of: a, the, to, is, it, movie, that, film,\n",
      "Nearest to its: it, in, to, film, movie, is, that, not,\n",
      "Nearest to time: was, since, through, from, before, and, he, no,\n",
      "Nearest to something: new, de, your, her, old, dull, all, hollywood,\n",
      "Nearest to clearly: meanspirited, clichs, staggeringly, ecks, camera, antonio, jordan, anywhere,\n",
      "Nearest to a: of, the, to, is, that, it, movie, film,\n",
      "epoch 2, avg loss 47.058719\n",
      "Nearest to the: of, film, be, a, to, is, it, movie,\n",
      "Nearest to feature: e, offers, michael, being, es, john, our, drama,\n",
      "Nearest to content: depleted, gut, dodgy, laziness, render, positions, bread, progenitor,\n",
      "Nearest to still: act, doesnt, together, de, because, tom, years, fans,\n",
      "Nearest to were: another, dont, when, script, will, seems, any, some,\n",
      "Nearest to are: by, at, will, some, comedy, like, on, i,\n",
      "Nearest to seagals: pinks, shiver, reruns, lack, grandiloquent, redundancies, infantile, tumbleweeds,\n",
      "Nearest to of: film, the, movie, is, to, a, that, in,\n",
      "Nearest to although: o, into, though, may, between, while, where, action,\n",
      "Nearest to something: we, american, de, got, her, last, youve, tv,\n",
      "epoch 3, avg loss 34.011008\n",
      "Nearest to more: like, at, by, are, with, on, i, this,\n",
      "Nearest to but: not, on, at, more, his, like, with, some,\n",
      "Nearest to optimistic: darkest, resuscitation, perplexed, phlegmatic, deleting, handiwork, cheeselaced, seldahls,\n",
      "Nearest to the: to, a, be, it, in, of, that, film,\n",
      "Nearest to made: dont, their, these, over, seems, whats, may, out,\n",
      "Nearest to simple: cowriter, thanks, perhaps, laughoutloud, endless, hill, quickly, deftly,\n",
      "Nearest to on: at, by, like, will, i, with, are, some,\n",
      "Nearest to reduce: gainsbourg, posse, vastness, compensated, sleepwalk, riding, delibrately, debases,\n",
      "Nearest to side: beyond, youd, stock, uma, deftly, plain, intense, moral,\n",
      "Nearest to afraid: besson, literarily, andrews, inseparable, apartheid, slug, knockabout, sappier,\n",
      "epoch 4, avg loss 26.862900\n",
      "Nearest to filmmakers: o, el, gets, john, before, above, es, tom,\n",
      "Nearest to predictable: thriller, robert, delivers, rarely, act, beautiful, offer, them,\n",
      "Nearest to pass: grumble, underwear, sentence, square, muzak, gutterball, crank, wretchedly,\n",
      "Nearest to its: i, film, in, that, be, at, any, like,\n",
      "Nearest to you: will, another, than, he, up, into, make, no,\n",
      "Nearest to portrait: el, hollywood, above, tv, gets, down, o, sex,\n",
      "Nearest to while: between, whose, una, de, here, action, documentary, project,\n",
      "Nearest to sounds: soldier, kittenish, sneering, bons, hovering, guests, dreamlike, overcooked,\n",
      "Nearest to to: any, in, that, his, its, us, on, i,\n",
      "Nearest to connected: preceded, garden, assailants, preach, nonconformist, gradegrubbers, lauded, backward,\n",
      "epoch 5, avg loss 22.830191\n",
      "Nearest to proves: tiempo, inventive, above, experience, paul, inside, him, evil,\n",
      "Nearest to is: by, or, for, who, when, any, like, on,\n",
      "Nearest to humor: those, short, last, adolescent, john, take, while, everything,\n",
      "Nearest to movie: to, in, its, who, that, on, it, has,\n",
      "Nearest to his: he, and, can, so, but, than, only, their,\n",
      "Nearest to class: h, saturday, rare, tragic, su, alive, bartleby, tan,\n",
      "Nearest to has: on, this, i, who, for, or, more, by,\n",
      "Nearest to wintry: munch, sounded, yearnings, dateflick, allenveloping, seriocomic, deer, badder,\n",
      "Nearest to except: frank, max, standard, scratch, fully, balance, took, bartleby,\n",
      "Nearest to delicately: repeatedly, trilogy, laughter, brash, capable, opaque, huskies, arentkidscute,\n",
      "epoch 6, avg loss 20.156413\n",
      "Nearest to for: on, i, this, or, with, its, any, like,\n",
      "Nearest to general: ns, games, delightfully, design, gran, napoleon, walking, rusi,\n",
      "Nearest to hackneyed: betrayal, propulsive, color, scandinavian, source, amiable, wondrously, influences,\n",
      "Nearest to audience: un, que, somewhat, wild, down, style, after, entirely,\n",
      "Nearest to this: on, for, its, film, with, that, i, in,\n",
      "Nearest to the: in, that, this, to, on, a, its, of,\n",
      "Nearest to more: by, are, like, or, comedy, funny, for, i,\n",
      "Nearest to of: in, film, its, on, with, this, that, the,\n",
      "Nearest to takes: tom, entirely, city, visual, looks, portrait, mas, gives,\n",
      "Nearest to bad: though, thats, over, their, between, never, seen, whats,\n",
      "epoch 7, avg loss 17.883237\n",
      "Nearest to and: though, on, or, yet, comedy, love, some, characters,\n",
      "Nearest to by: like, i, at, for, again, with, when, this,\n",
      "Nearest to have: will, you, what, but, not, been, really, too,\n",
      "Nearest to in: movie, film, that, to, its, for, the, of,\n",
      "Nearest to could: was, dont, their, into, after, only, never, also,\n",
      "Nearest to a: movie, its, in, that, film, of, to, the,\n",
      "Nearest to reliable: fling, quirkily, absence, rapturous, pottymouthed, slambang, victories, blender,\n",
      "Nearest to anne: upon, strangely, robert, deftly, se, kung, martial, rob,\n",
      "Nearest to any: even, his, at, has, by, are, so, not,\n",
      "Nearest to pinnacle: tsais, pelculas, gawk, acolytes, gaps, tossup, hooked, etoiles,\n",
      "epoch 8, avg loss 15.352650\n",
      "Nearest to too: some, only, their, will, us, his, there, really,\n",
      "Nearest to creatively: derry, crackers, offthewall, rhino, quiver, doubtful, zip, glosses,\n",
      "Nearest to with: again, this, for, on, at, its, bad, like,\n",
      "Nearest to nothing: then, entertaining, also, plot, can, interesting, far, romantic,\n",
      "Nearest to arctic: campaigntrail, dim, locale, crafting, byatts, webcast, limpid, hail,\n",
      "Nearest to delicately: hardeyed, sources, joel, cristo, clinical, business, notting, gross,\n",
      "Nearest to go: because, your, cant, through, should, seems, think, might,\n",
      "Nearest to the: that, in, movie, its, this, is, a, for,\n",
      "Nearest to aimed: scorseses, chiller, crash, productions, xxx, wallace, reno, friday,\n",
      "Nearest to it: be, i, film, for, movie, that, is, its,\n",
      "epoch 9, avg loss 13.984252\n",
      "Nearest to now: george, y, michael, sex, um, unfunny, perhaps, evil,\n",
      "Nearest to which: exercise, years, her, despite, completely, picture, my, last,\n",
      "Nearest to like: again, on, this, has, i, for, will, how,\n",
      "Nearest to decisions: subgenres, dunce, misunderstood, neofascism, besson, essayist, calories, kafkaesque,\n",
      "Nearest to a: movie, that, it, film, in, the, is, with,\n",
      "Nearest to transfixes: brazil, stiflingly, perplexed, infecting, serrys, goodnaturedness, prod, lessthanthrilling,\n",
      "Nearest to what: will, them, any, was, up, dont, there, performance,\n",
      "Nearest to coming: moving, contrived, powerful, ideas, blade, hilarious, pretty, unexpected,\n",
      "Nearest to as: well, much, long, even, they, them, movies, work,\n",
      "Nearest to trinity: slugfest, vanished, nonbeliever, annoyances, beachcombing, cleverest, skeletons, temerity,\n",
      "epoch 10, avg loss 12.823031\n",
      "Nearest to their: his, are, characters, so, some, by, another, or,\n",
      "Nearest to that: movie, like, in, the, film, this, with, to,\n",
      "Nearest to aficionados: bradbury, helljaunt, factor, hayseedsvs, congeals, spellbinding, hypercliched, cumbersome,\n",
      "Nearest to whose: el, offers, por, en, project, red, la, una,\n",
      "Nearest to a: movie, its, film, in, like, that, with, the,\n",
      "Nearest to 2002: classic, final, giant, version, power, ultimate, nonsense, side,\n",
      "Nearest to grief: gross, observation, bleakness, hypercliched, hayseedsvs, congeals, slavishly, farrelly,\n",
      "Nearest to been: have, would, may, should, up, when, can, better,\n",
      "Nearest to so: their, his, any, are, about, comedy, some, or,\n",
      "Nearest to in: film, movie, of, with, its, that, like, to,\n"
     ]
    }
   ],
   "source": [
    "# Only whole batches are used; a trailing partial batch is dropped.\n",
    "n_batches = X.shape[0] // batch_size\n",
    "\n",
    "for epoch_no in range(epoch):\n",
    "    # --- one pass of gradient descent over all whole batches ---\n",
    "    total_cost = 0\n",
    "    for start in range(0, n_batches * batch_size, batch_size):\n",
    "        batch_x = X[start:start + batch_size]\n",
    "        # model.Y expects a column, hence the extra axis.\n",
    "        batch_y = Y[start:start + batch_size, np.newaxis]\n",
    "        cost, _ = sess.run([model.cost, model.optimizer],\n",
    "                           feed_dict={model.X: batch_x, model.Y: batch_y})\n",
    "        total_cost += cost\n",
    "    # max(..., 1) avoids a ZeroDivisionError when there are fewer rows than one batch.\n",
    "    total_cost /= max(n_batches, 1)\n",
    "    print('epoch %d, avg loss %f' % (epoch_no + 1, total_cost))\n",
    "\n",
    "    # --- similarity report for a few ids sampled from the corpus ---\n",
    "    random_valid_size = np.random.choice(indexed, valid_size)\n",
    "    similarity = sess.run(model.similarity,\n",
    "                          feed_dict={model.valid_dataset: random_valid_size})\n",
    "    # Distinct loop names: the original reused `i` and `k` here, shadowing the\n",
    "    # epoch counter and the batch offset.\n",
    "    for row, word_id in enumerate(random_valid_size):\n",
    "        valid_word = idx2word[word_id]\n",
    "        # Rank by descending similarity and skip the first hit (the query word itself).\n",
    "        nearest = (-similarity[row, :]).argsort()[1:nearest_neighbors + 1]\n",
    "        # Iterating over `nearest` itself (not range(nearest_neighbors)) stays in bounds\n",
    "        # when the vocabulary has fewer rows than nearest_neighbors + 1.\n",
    "        pieces = ['Nearest to %s:' % valid_word]\n",
    "        pieces.extend('%s,' % idx2word[neighbor_id] for neighbor_id in nearest)\n",
    "        print(' '.join(pieces))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
